# This file shows how to reproduce the results of the sensitivity check

# Results that can be obtained from this file include:

  ## Appendix 3: Sensitivity Check of GPT-generated Dictionaries in Pathway 1
  ### Figure 3.1: Performance of sub-dictionaries of innovation, impact, and replicability
  ### Figure 3.2: Prediction Accuracy of UNPSA Winners after Swapping the Innovation Dictionary with the Impact or Replicability Dictionary
  ### Figure 3.3: Comparison of Complete Dictionaries and Random Benchmark

## R version:4.4.2

#-----------------Prepare the libraries and the dataset------------------------#
rm(list = ls())  

# Libraries  
library(readxl)  
library(ggplot2)
library(dplyr)
library(tidyr) 

data <- read_excel("~/Desktop/sensitivity.xlsx") 


#---------Figure 3.1: Performance of sub-dictionaries--------------------------#

# 1) Set up a 2x1 graphic layout

par(mfrow = c(2, 1), mar = c(4, 4, 3, 2))

# 2) line plot of Accuracy of Term Size by Model-------------------------------- 

plot(NA, xlim = c(20, 100), ylim = c(0.90, 1.0), 
     xlab = "% of terms", ylab = "Accuracy",
     main = "Accuracy of Term Size by Model",
     xaxt = "n")
axis(1, at = c(20, 40, 60, 80, 100))
grid()

models <- unique(data$Model)
colors <- c("red", "blue", "green", "orange")
line_types <- c(1, 2, 3, 4)  

for (i in 1:length(models)) {
  model_data <- data[data$Model == models[i], ]
  lines(model_data$Term_size, model_data$Accuracy, 
        type = "b", col = colors[i], lty = line_types[i], lwd = 2, pch = 16)
}

legend("bottomleft", legend = paste("Model", models),
       col = colors, lty = line_types, lwd = 2, pch = 16, bty = "n",
       ncol = 2, cex = 0.5, x.intersp = 0.5, y.intersp = 0.5)

# 3) Line plot of F1 Score of Term Size by Model-------------------------------
plot(NA, xlim = c(20, 100), ylim = c(0.1, 1.0), 
     xlab = "% of terms", ylab = "F1 Score",
     main = "F1 Score of Term Size by Model",
     xaxt = "n")
axis(1, at = c(20, 40, 60, 80, 100))
grid()

for (i in 1:length(models)) {
  model_data <- data[data$Model == models[i], ]
  lines(model_data$Term_size, model_data$F1_Score, 
        type = "b", col = colors[i], lty = line_types[i], lwd = 2, pch = 16)
}

legend("topleft", legend = paste("Model", models),
      col = colors, lty = line_types, lwd = 2, pch = 16, bty = "n",
      ncol = 2, cex = 0.5, x.intersp = 0.5, y.intersp = 0.5)

# 4) Combine two plots together
par(mfrow = c(1, 1))

#-------------------------------------------------------------------------------#
#-------------------------------------------------------------------------------#

#Figure 3.2: Prediction Accuracy of UNPSA Winners after Swapping dictionaries--#

# 1) Select the data-----------------------------------------------------------
winner_data <- data %>%
  filter(Term_size == 100) %>%
  select(Model, 
         before_swap = Winner_auccracy_before,
         swap_innovation_impact = Winner_auccracy_swap1,
         swap_innovation_replicability = Winner_auccracy_swap2)

# 2) Convert data from wide format to long format------------------------------
winner_long <- winner_data %>%
  pivot_longer(
    cols = -Model,
    names_to = "condition",
    values_to = "accuracy"
  )

# 3) Set the factor level and labels-------------------------------------------
winner_long$Model <- factor(winner_long$Model, 
                            levels = c(1, 2, 3, 4),
                            labels = c("Model 1", "Model 2", "Model 3", "Model 4"))

winner_long$condition <- factor(winner_long$condition,
                                levels = c("before_swap", "swap_innovation_impact", "swap_innovation_replicability"),
                                labels = c("Before Swap", "Swap Innovation & Impact", "Swap Innovation & Replicability"))

# 4) Creat the line plot
ggplot(winner_long, aes(x = Model, y = accuracy, group = condition, 
                        color = condition, linetype = condition)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(
    x = " ",
    y = "UNPSA Winner Prediction Accuracy",
    color = "Condition",
    linetype = "Condition"
  ) +
  scale_color_manual(
    values = c("Before Swap" = "blue", 
               "Swap Innovation & Impact" = "red", 
               "Swap Innovation & Replicability" = "orange")
  ) +
  scale_linetype_manual(
    values = c("Before Swap" = "solid", 
               "Swap Innovation & Impact" = "dashed", 
               "Swap Innovation & Replicability" = "dotted")
  ) +
  scale_y_continuous(
    labels = scales::percent,
    limits = c(0.1, 1.0)
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 10),
    axis.text = element_text(size = 10),
    legend.position = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(color = "black"),
    legend.key.width = unit(1, "cm"),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 10, face = "bold")  
  ) +
  geom_text(data = subset(winner_long, condition == "Before Swap"), 
            aes(label = scales::percent(accuracy, accuracy = 0.1)), 
            vjust = -1, size = 3) +
  geom_text(data = subset(winner_long, condition == "Swap Innovation & Impact"), 
            aes(label = scales::percent(accuracy, accuracy = 0.1)), 
            vjust = -1, size = 3) +
  geom_text(data = subset(winner_long, condition == "Swap Innovation & Replicability"), 
            aes(label = scales::percent(accuracy, accuracy = 0.1)), 
            vjust = 2, size = 3)


#-------------------------------------------------------------------------------#
#-------------------------------------------------------------------------------#

#----Figure 3.3: Comparison of Complete Dictionaries and Random Benchmark------# 

# 1) select the data and convert Term_size to percentage format-----------------
data1 <- data %>%
  select(Term_size, Model, Accuracy, PR_AUC) %>%
  mutate(
    Proportion = paste0(Term_size, "%"),
    Model = paste("Model", Model)
  )  


# 2) Convert Proportion and model to factors------------------------------------ 
data1$Proportion <- factor(data1$Proportion, 
                          levels = c("20%", "40%", "60%", "80%", "100%"))

data1$Model <- factor(data1$Model, 
                     levels = c("Model 1", "Model 2", "Model 3", "Model 4"))

# 3) Accuracy at Different Term Sizes Heatmap-----------------------------------
ggplot(data1, aes(x = Proportion, y = Model, fill = Accuracy)) +
  geom_tile(color = "white", size = 1) +
  geom_text(aes(label = round(Accuracy, 3)), color = "white", size = 4, fontface = "bold") +
  scale_fill_gradient2(low = "#2166ac", mid = "#FFA07A", high = "#b2182b", 
                       midpoint = 0.96, name = "Accuracy") +
  labs(
    title = "Accuracy at Different Term Sizes",
    x = "Term size",
    y = "Model"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text = element_text(size = 10)
  )

# PR-AUC at Different Term Sizes Heatmap----------------------------------------
ggplot(data1, aes(x = Proportion, y = Model, fill = PR_AUC)) +
  geom_tile(color = "white", size = 1) +
  geom_text(aes(label = round(PR_AUC, 3)), color = "white", size = 4, fontface = "bold") +
  scale_fill_gradient2(low = "#2166ac", mid = "#FFA07A", high = "#b2182b", 
                       midpoint = 0.6, name = "PR-AUC") +
  labs(
    title = "PR-AUC at Different Term Sizes",
    x = "Term size",
    y = "Model"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text = element_text(size = 10)
  )